import jsonlines
from tqdm import tqdm
# Every record of the tagged JSON Lines file, held in memory in file order.
datas = []

with jsonlines.open("7M_all_label/7M_label_tagged.jsonl") as tagged_reader:
    # tqdm only wraps the reader for progress reporting; extend() consumes it
    # record by record, exactly like an explicit append loop.
    datas.extend(tqdm(tagged_reader))

def _match_key(record):
    """Return the fields used to decide that two records are the same sample.

    The key is the stringified ``id`` plus the ``value`` of the last two
    conversation turns. A tuple is used (rather than one concatenated string)
    so that field boundaries cannot blur and produce a false match.

    Raises:
        KeyError / IndexError: if the record lacks ``id``, ``conversations``,
            or has fewer than two conversation turns.
    """
    turns = record["conversations"]
    return (str(record["id"]), turns[-1]["value"], turns[-2]["value"])


# Merge the normalized records into the raw record stream.
#
# The normalized file is walked in order; for each of its records, raw records
# are consumed until the matching one is found. Raw records skipped on the way
# have no normalized counterpart: they are kept, with an empty ``label``. The
# matching raw record is replaced by the normalized record.
# NOTE(review): this relies on both files listing shared records in the same
# relative order -- a normalized record with no raw match consumes every
# remaining raw record.
merge_datas = []

# Reverse once so the "next" raw record can be taken with an O(1) pop() from
# the end; list.pop(0) shifts the whole list and made this loop quadratic.
datas.reverse()

with jsonlines.open("7M_all_label_normalize/infinity-instruct-7M-eng.jsonl") as f:
    for line in tqdm(f):
        # Loop-invariant for the inner scan, so compute it only once.
        line_key = _match_key(line)
        while datas:
            raw_line = datas.pop()
            if _match_key(raw_line) == line_key:
                merge_datas.append(line)
                break
            raw_line["label"] = {}
            merge_datas.append(raw_line)

# Raw records that come after the last normalized record also have no
# normalized counterpart; keep them with an empty label instead of silently
# dropping them, consistent with the unmatched records handled above.
while datas:
    raw_line = datas.pop()
    raw_line["label"] = {}
    merge_datas.append(raw_line)

# Write the merged records out as JSON Lines, preserving their merged order.
with jsonlines.open("7M_all_label_normalize/infinity-instruct-7M-eng-merge.jsonl", "w") as merged_writer:
    # write_all() encodes and writes each record in turn; tqdm reports progress.
    merged_writer.write_all(tqdm(merge_datas))
